# Replication file for: "Beyond the Hazard Ratio: Generating        #
# Expected Durations from the Cox Proportional Hazards Model"       #
#                                                                   #
# Jonathan Kropko                                                   #
# University of Virginia                                            #
# jkropko@virginia.edu                                              #
#                                                                   #
# Jeffrey J. Harden                                                 #
# University of Notre Dame                                          #
# jeff.harden@nd.edu                                                #
#                                                                   #
# Journal article meta analysis                                     #
# Last update: September 8, 2016                                    #
#####################################################################
### Packages and Data ###
library(tm)

# Load the article-level meta-analysis data (text columns stay character) and
# normalise the encoding of the hypotheses text in one step.
ma <- local({
  articles <- read.csv(file = "meta_analysis.csv", stringsAsFactors = FALSE)
  # Mark the text as UTF-8, then run it through iconv(); with sub = "byte" any
  # bytes that cannot be converted are substituted instead of producing NA.
  articles$hypotheses_text <- iconv(
    enc2utf8(articles$hypotheses_text),
    sub = "byte"
  )
  articles
})

### Convert to corpus ###
# Pool the hypotheses text of every article into one single-document corpus
ma.text <- paste(ma$hypotheses_text, collapse = " ")
ma.source <- VectorSource(ma.text)

# Cleaning, in order: lower-case, remove punctuation, collapse runs of
# whitespace, then remove English stop words
corpus <- tm_map(Corpus(ma.source), content_transformer(tolower))
corpus <- tm_map(tm_map(corpus, removePunctuation), stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))

### Document term matrix ###
# Term counts for the pooled corpus, first as a tm object and then as a
# plain matrix (documents in rows, terms in columns)
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)

### Compute frequencies ###
# Total occurrences of each term, most frequent first
frq <- sort(colSums(dtm2), decreasing = TRUE)

# write.csv(frq, "word_frq.csv")

### Count risk/duration words in each article ###
# Lexicon with one row per word and indicator columns for each frame
fw <- read.csv(file = "frame_words.csv", stringsAsFactors = FALSE)

# Rows flagged as belonging to the risk frame
risk.rows <- fw$risk == 1
# Rows flagged as belonging to the duration frame that also have a positive count
duration.rows <- fw$duration == 1 & fw$count > 0

risk.words <- fw$word[risk.rows]         # Set of words in the risk frame
duration.words <- fw$word[duration.rows] # Set of words in the duration frame

# Classify the hypotheses text of each article by frame. Row i of frame.type
# describes article i (row i of ma); its three columns hold:
#   1: the frame with more distinct words present ("risk", "duration", "equal")
#   2: the frame with more total word occurrences ("risk", "duration", "equal")
#   3: "no risk words", "no duration words", or "both"
frame.type <- matrix(NA_character_, nrow = nrow(ma), ncol = 3)

# The stop word list is the same for every article, so look it up once
english.stopwords <- stopwords("english")

# Name the frame with the larger tally, or "equal" when the tallies tie
larger.frame <- function(n.risk, n.duration) {
  if (n.risk > n.duration) {
    "risk"
  } else if (n.duration > n.risk) {
    "duration"
  } else {
    "equal"
  }
}

# seq_len() gives an empty sequence when ma has no rows, whereas 1:0 would
# run the body for i = 1 and i = 0
for (i in seq_len(nrow(ma))) {
  # Build and clean a one-document corpus for this article: lower-case,
  # remove punctuation, collapse whitespace, remove English stop words
  hyp.text <- paste(ma[i, "hypotheses_text"], collapse = " ")
  hyp.source <- VectorSource(hyp.text)
  hyp.corp <- Corpus(hyp.source)
  hyp.corp <- tm_map(hyp.corp, content_transformer(tolower))
  hyp.corp <- tm_map(hyp.corp, removePunctuation)
  hyp.corp <- tm_map(hyp.corp, stripWhitespace)
  hyp.corp <- tm_map(hyp.corp, removeWords, english.stopwords)
  hyp.dtm <- as.matrix(DocumentTermMatrix(hyp.corp))
  words <- colnames(hyp.dtm)

  # Vectorised membership tests; these are zero-length (and every sum below
  # is 0) when the article has no terms left after cleaning
  in.risk <- words %in% risk.words
  in.duration <- words %in% duration.words

  # Number of distinct frame words present in the article
  frame.present <- c(sum(in.risk), sum(in.duration))
  # Total occurrences of frame words in the article
  frame.count <- c(sum(hyp.dtm[, in.risk]), sum(hyp.dtm[, in.duration]))

  frame.type[i, 1] <- larger.frame(frame.present[1], frame.present[2])
  frame.type[i, 2] <- larger.frame(frame.count[1], frame.count[2])
  # The risk check comes first, so an article with words from neither frame
  # is labelled "no risk words"
  frame.type[i, 3] <- if (frame.present[1] == 0) {
    "no risk words"
  } else if (frame.present[2] == 0) {
    "no duration words"
  } else {
    "both"
  }
}

# Distribution of articles over each of the three classifications
table(frame.type[ , 1])
table(frame.type[ , 2])
table(frame.type[ , 3])

### Count of methods used ###
# Each article has three method columns (method1, method2, method3); the
# counts below treat an empty string as an unused slot.

# TRUE where a method slot is filled in. The is.na() test is needed because a
# column that is blank for every article is read by read.csv() as logical NA,
# and nchar(NA) is 2, which would count every article as using that method.
has.method <- function(x) !is.na(x) & nzchar(x)

# Tally methods over all three slots. Blank slots are removed by value rather
# than by dropping the first table entry, so a real method is never discarded
# when no blank slot happens to be present.
all.methods <- c(ma$method1, ma$method2, ma$method3)
method.count <- table(all.methods[has.method(all.methods)])
method.count

# Average number of methods per article
sum(method.count)/nrow(ma)

used1 <- has.method(ma$method1)
used2 <- has.method(ma$method2)
used3 <- has.method(ma$method3)

sum(used1 & !used2 & !used3) # Articles with only method1 filled in
sum(used2 & !used3)          # Articles with method2 but not method3 filled in
sum(used3)                   # Articles with method3 filled in

